Home location is the most important factor in home value. Identical homes can have dramatically different values due to the location. One of the factors that home buyers concern is the school district.
The average prices of nearby houses are higher when the high schools get higher ranking.
if (require(dplyr) == FALSE) {install.packages('dplyr')}
if (require(ggmap) == FALSE) {install.packages('ggmap')}
if (require(XML) == FALSE) {install.packages('XML')}
if (require(gridExtra) == FALSE) {install.packages('gridExtra')}
if (require(psych) == FALSE) {install.packages('psych')}
library(dplyr)
library(ggmap)
library(ggplot2)
library(XML)
library(gridExtra)
library(psych)
if (require(devtools) == FALSE) {install.packages('devtools')}
library(devtools)
if (require(rvest) == FALSE) {devtools::install_github("hadley/rvest")}
library(rvest)
library(stringr)
library(reshape2)
1) The rank and address of high school were web-scraped from the U.S. News rankings using rvest package.
Website: http://www.usnews.com/education/best-high-schools/national-rankings
The U.S. News rankings include data on more than 19,400 public high schools in 50 states and the District of Columbia. However, only around 2100 public schools have ranking.
# Extract school rank, school name and address from usnews.com
web <- 'http://www.usnews.com/education/best-high-schools/national-rankings/spp+100/page+'
school <- data.frame(rankingscore = numeric(0), schoolname = character(0),
street = character(0), csz = character(0))
for (i in 1:21){
url <- paste0(web, i)
website <- html(url)
rankingscore <- website %>% html_nodes("span.rankings-score span") %>% html_text()
if (length(rankingscore)< 100){
rankingscore <- append(rankingscore, rep(NA, 100-length(rankingscore)))
}
schoolname <- website %>% html_nodes("div.school-name a") %>% html_text()
street <- website %>% html_nodes("div.school-street") %>% html_text()
csz <- website %>% html_nodes("div.school-csz") %>% html_text()
all <- data.frame(cbind(rankingscore, schoolname, street, csz),
stringsAsFactors = FALSE)
school <- rbind(school, all)
}
# Convert ranking score into numeric format
school$rankingscore <- as.numeric(str_replace_all(school$rankingscore, "[#,]", ""))
school$schoolname <- str_replace_all(school$schoolname, "(<U\\+200B>)", "")
# split csz into city, state and zipcode
csz <- colsplit(school$csz, ',', c('city', 'sz'))
csz1 <- data.frame(str_split_fixed(str_trim(csz$sz), ' ', n = 2),
stringsAsFactors = FALSE)
names(csz1) <- c('state', 'zipcode')
split_csz<- setNames(data.frame(cbind(csz$city, csz1$state, csz1$zipcode),
stringsAsFactors = FALSE), c('city', 'state', 'zipcode'))
# Combine into final school dataframe
schoolranking <- cbind(school[1:3], split_csz)
2) Zillow Home Value Index (zillowIndex) was used to present the average home price around the ranked high schools.
The Zillow Home Value Index is the median Zestimate home valuation (Zillow’s estimated market value). ZillowIndex was queried through Zillow GetDemographics API. Website: http://www.zillow.com/howto/api/GetDemographics.htm
Due to the limitation of 1000 queries per day by zillow API, the dataset was devided into 3 parts (900, 900 and 300 records respectively). The query was run once per day for each part. The completed dataset was combined using rbind() function.
# Create function that takes zipcode and query the zillow home value index from zillowAPI
getZillow <- function(zip){
url <- paste0("http://www.zillow.com/webservice/GetDemographics.htm?zws-id=X1-ZWz1dx93rz7eob_9fxlx&zip=", zip)
doc <- xmlParse(url)
e <- xpathSApply(doc, path = "//response/pages//attribute/name[contains(text(), 'Zillow Home Value Index')]/..//zip/value", xmlValue)
return (e)
}
# Divide schoolranking into three parts, retrieve zillowindex from zillow API one part a day.
part1 <- schoolranking[1:900, ]
part2 <- schoolranking[901:1800, ]
part3 <- schoolranking[1801:2100, ]
zillowindex1 <- lapply(part1$zipcode, getZillow)
zillowindex1[sapply(zillowindex1, is.null)] <- NA
zillowindex1 <- unlist(zillowindex1)
part1final <- cbind(part1, zillowindex1)
zillowindex2 <- lapply(part2$zipcode, getZillow)
zillowindex2[sapply(zillowindex2, is.null)] <- NA
zillowindex2 <- unlist(zillowindex2)
part2final <- cbind(part2, zillowindex2)
zillowindex3 <- lapply(part3$zipcode, getZillow)
zillowindex3[sapply(zillowindex3, is.null)] <- NA
zillowindex3 <- unlist(zillowindex3)
part3final <- cbind(part3, zillowindex3)
row.names(part1final)<-NULL
row.names(part2final)<-NULL
row.names(part3final)<-NULL
names(part1final) <- c('rank', 'school', 'street', 'city', 'state',
'zipcode', 'zillowindex')
names(part2final) <- c('rank', 'school', 'street', 'city', 'state',
'zipcode', 'zillowindex')
names(part3final) <- c('rank', 'school', 'street', 'city', 'state',
'zipcode', 'zillowindex')
rank <- rbind(part1final, part2final, part3final)
save(rank, file = "C:/temp/rank.rda")
3) The latitude and longitude coordinates of the zipcodes were obtained by ggmap package.
# Obtain lat and lon by geocode() function in ggmap package
lonlat <- geocode(rank$zipcode)
schoolrank <- cbind(rank, lonlat)
save(schoolrank, file = "C:/temp/schoolrank.rda")
The final dataset was saved as .rda file and uploaded to the public folder in dropbox.
The .rda file was downloaded from dropbox directly.
# load("C:/temp/schoolrank.rda")
setInternet2(use = TRUE)
download.file("https://dl.dropboxusercontent.com/u/8963938/schoolrank.rda",
destfile="./schoolrank.rda")
load("./schoolrank.rda")
1) Convert zillowIndex from character into numeric
schoolrank$zillowindex <- as.character(schoolrank$zillowindex)
schoolrank$zillowindex <- as.numeric(schoolrank$zillowindex)
2) Convert rank column into integer
schoolrank$rank <- as.integer(schoolrank$rank)
3) Remove the records in ‘rank’ column that contains NA (without rank)
school <- schoolrank %>% filter (!is.na(rank))
# From 2100 to 2019
4) Remove the records in ‘zillowindex’ column that contains NA (no local zillowIndex on that zipcode)
school <- school %>% filter (!is.na(zillowindex))
# From 2019 to 1472
5) Remove the state that has less than 3 schools (TX and DE)
state.lt3 <- as.list(school %>% group_by(state) %>%
summarise(count = n()) %>% filter(count < 3) %>%
select(state))
names(state.lt3)<-NULL
state.lt3<-unlist(state.lt3)
school <- school[!school$state %in% state.lt3,]
# From 1472 to 1469
1) Summary of dataset
summary(school)
## rank school street city
## Min. : 2.0 Length:1469 Length:1469 Length:1469
## 1st Qu.: 472.0 Class :character Class :character Class :character
## Median : 944.0 Mode :character Mode :character Mode :character
## Mean : 966.1
## 3rd Qu.:1439.0
## Max. :2019.0
## state zipcode zillowindex lon
## Length:1469 Length:1469 Min. : 35700 Min. :-158.16
## Class :character Class :character 1st Qu.: 183200 1st Qu.:-117.86
## Mode :character Mode :character Median : 294200 Median : -86.23
## Mean : 388454 Mean : -95.03
## 3rd Qu.: 495100 3rd Qu.: -77.80
## Max. :3068900 Max. : -70.30
## lat
## Min. :20.85
## 1st Qu.:34.18
## Median :39.17
## Mean :38.23
## 3rd Qu.:41.70
## Max. :48.76
2) Total states included in the data: 34
length(unique(school$state))
## [1] 34
3) Rank: from 2 to 2019
summary(school$rank)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 472.0 944.0 966.1 1439.0 2019.0
4) zillowIndex: range from 35,700 to 3,068,900, right-skewed
describe(school$zillowindex)
## vars n mean sd median trimmed mad min max range
## 1 1 1469 388453.5 315638.3 294200 336042.1 196148 35700 3068900 3033200
## skew kurtosis se
## 1 2.72 11.9 8235.29
ggplot(school, aes(x=zillowindex)) +
geom_histogram(aes(y=..density..), binwidth=80000, color='black', fill='white') +
geom_density(alpha=.2, fill="#003399") +
ggtitle('ZillowIndex histogram overlaid with density curve') +
theme_bw()
# Ref: http://www.cookbook-r.com/Graphs/Plotting_distributions_%28ggplot2%29/
1) Geographical distribution of rank and zillowIndex
Expensive houses are located at west coast and northeast coast.
map <- get_map(location = 'United states', zoom = 4,
maptype = 'roadmap', source = 'google', color = 'color')
ggmap(map) + geom_point(aes(x=lon, y=lat, colour=zillowindex),
data=school, alpha = 0.6, size=3, na.rm = T) +
scale_color_gradient(low="#3333FF", high="#FF3300") +
ggtitle('Distribution: Zillow House Value Index')
Schools with different ranks are more evenly distributed.
ggmap(map) + geom_point(aes(x=lon, y=lat, colour=rank), data=school,
alpha = 0.8, size=3, na.rm = T) +
scale_color_gradient(low="#FF3300", high="#3333FF") +
ggtitle('Distribution: School rank')
2) Relationship between school rank and zillowIndex
ggplot(school, aes(x=rank, y=zillowindex/1000)) +
geom_point(color= "#0066CC", alpha = 0.6) + coord_trans(y = "log10") +
stat_smooth(color="#990066", size = 1, method = "lm") +
xlab('school rank') +
ylab("zillowIndex in thousand (log10 scale)") +
ggtitle('zillowIndex ~ school rank') +
theme_bw()
Correlation: weak downhill (negative) linear relationship (-0.3305684)
cor(school$zillowindex, school$rank)
## [1] -0.3305684
3) summary statistics: zillowIndex and school rank group by states
describe <- describeBy(school$zillowindex, school$state, mat=TRUE)
describe_rank <- describeBy(school$rank, school$state, mat=TRUE)
p1 <- ggplot(describe, aes(x=group1, y=mean/1000)) +
geom_bar(stat="identity", fill="#003399") +
geom_text(aes(label=n), vjust = -1) +
xlab('State') +
ylim(0, 750) +
ylab('average zillow index in thousand') +
ggtitle('Average zillow index in states') +
theme_bw()
p2 <- ggplot(describe_rank, aes(x=group1, y=mean)) +
geom_bar(stat="identity", fill="#003399") +
geom_text(aes(label=round(mean)), vjust = -1, size = 3) +
xlab('State') +
ylim(0, 1650) +
ylab('average school rank') +
ggtitle('Average school rank in states') +
theme_bw()
grid.arrange(p1, p2, nrow=2)
Correlation for each state.
school %>%
group_by(state) %>%
summarize(count= n(), cor = cor(rank, zillowindex)) %>%
arrange(cor)
## Source: local data frame [34 x 3]
##
## state count cor
## 1 NE 3 -0.91101025
## 2 VT 7 -0.88602865
## 3 AR 11 -0.80913707
## 4 HI 4 -0.69531463
## 5 NC 32 -0.67183477
## 6 AL 9 -0.61729479
## 7 OR 23 -0.58814507
## 8 MD 41 -0.58771063
## 9 MN 26 -0.58192942
## 10 VA 35 -0.57194138
## 11 WA 47 -0.56875469
## 12 CA 399 -0.51006703
## 13 TN 16 -0.47679506
## 14 MI 46 -0.45735161
## 15 MA 60 -0.45327238
## 16 PA 47 -0.44240882
## 17 OH 105 -0.42968357
## 18 IL 70 -0.42124402
## 19 GA 47 -0.40742061
## 20 NJ 36 -0.38357194
## 21 CT 37 -0.34897600
## 22 NV 7 -0.26047620
## 23 NY 110 -0.24423338
## 24 CO 40 -0.22573579
## 25 NH 13 -0.20772438
## 26 FL 80 -0.12407612
## 27 AZ 42 -0.09834842
## 28 SC 11 -0.07792724
## 29 RI 5 -0.06903855
## 30 WI 25 -0.04376106
## 31 WV 5 0.03251108
## 32 KY 10 0.11159924
## 33 OK 11 0.35832774
## 34 IA 9 0.37455415
Across the country, there is weak coorrelation between rank of school and average house value (zillow house value index) in surrounding area. By state, the correlation coefficients were highly different. Most of states show negative correlation between school and average house value.
Discussion
Although there is trend that the average house value is higher when rank of school is better, it’s not strong correlated. Other factors, such as local economy and security of the neighborhood, also play important roles on house value. This analysis was only based on the top 10% of total public high schools (1469 schools in 34 states out of around 20,000 public high schools in the USA). The results may not represent the overall population.
New features
ggmap
psych: describe(), describeBy()
Challenges
rvest package: missing NULL values when unlist. Had to transform NULL into NA.
Zillow API only allows 1000 queries per day.
XML package: Xpath query.
Appendix
Appendix 1. Plot of zillowIndex and rank in Arkansas (cor = -0.80913707)
school %>% filter(state == 'AR') %>%
ggplot(aes(x=rank, y=zillowindex/1000)) +
geom_point(color= "#0066CC", alpha = 0.6) +
stat_smooth(color="#990066", size = 1, method = "lm", se=FALSE) +
xlab('school rank') +
ylab("zillowIndex in thousands") +
ggtitle('zillowIndex ~ school rank in Arkansas') +
theme_bw()
Appendix 2. Bin the zillowIndex into three catogeries: low (below Q1), medium (Q1 to Q3) and high (above Q3).
school$valuelevel <- cut(school$zillowindex, breaks = c(0, 183200, 495100, 3069000),
labels=c("Low","Medium","High"), include.lowest=TRUE)
ggplot(school, aes(x=rank, y=zillowindex/10000)) +
geom_point(alpha = 0.6) +
stat_smooth(method = "lm") +
facet_wrap(~valuelevel, scales = "free_y") +
xlab('school rank') +
ylab('zillowIndex in thousands')
Medium and high zillowIndex tend to have negative correlation, while low zillowIndex didn’t show much correlation.
ggplot(school, aes(x=valuelevel, y=rank)) +
geom_boxplot() +
xlab('zillowIndex') +
ylab('school rank')
Correlation for each house value level.
school %>%
group_by(valuelevel) %>%
summarize(count= n(), cor = cor(rank, zillowindex))
## Source: local data frame [3 x 3]
##
## valuelevel count cor
## 1 Low 368 0.08310833
## 2 Medium 734 -0.16234948
## 3 High 367 -0.21903071
No strong correlation for each group.